library(readxl)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
#1. Create data frame "dfCountries", load countries population file in it.
dfCountries <- read.csv("Countries Population.csv")
#2. Sort countries according to their population in ascending and descending order
# To sort countries according to population in ascending order
head(dfCountries[order(dfCountries$Total.Population.2017),])
## Country.Name Country.Code Total.Population.2017 X X.1
## 204 Tuvalu TUV 11192 NA NA
## 140 Nauru NRU 13649 NA NA
## 153 Palau PLW 21729 NA NA
## 28 British Virgin Islands VGB 31196 NA NA
## 186 St. Martin (French part) MAF 32125 NA NA
## 167 San Marino SMR 33400 NA NA
#To sort countries according to population in descending order
head(dfCountries[order(-dfCountries$Total.Population.2017),])
## Country.Name Country.Code Total.Population.2017 X X.1
## 42 China CHN 1386395000 NA NA
## 91 India IND 1339180127 NA NA
## 209 United States USA 325719178 NA NA
## 92 Indonesia IDN 263991379 NA NA
## 27 Brazil BRA 209288278 NA NA
## 152 Pakistan PAK 197015955 NA NA
#3. A vector of countries with population more than 10000000
countries <- dfCountries$Country.Name[dfCountries$Total.Population.2017 > 10000000]
#4. Create a dataframe "dfBigAndSmall" that has countries with population greater than 10M and less than 2M.
dfBigAndSmall <- filter(dfCountries, Total.Population.2017 > 1000000 & Total.Population.2017 < 2000000)
head(dfBigAndSmall)
## Country.Name Country.Code Total.Population.2017 X X.1
## 1 Bahrain BHR 1492584 NA NA
## 2 Cyprus CYP 1179551 NA NA
## 3 Equatorial Guinea GNQ 1267689 NA NA
## 4 Estonia EST 1315480 NA NA
## 5 Eswatini SWZ 1367254 NA NA
## 6 Guinea-Bissau GNB 1861283 NA NA
#5. Create levels of income group from dataset "Countries region mapping" levels: Low, Lower mid, Upper mid, High
# To load countries Region Mapping dataset
df2 <- read_excel('Countries Region Mapping.xlsx')
df2$IncomeGroup <- factor(df2$IncomeGroup, levels = c("Low income", "Lower middle income", "Upper middle income", "High income"), labels = c("Low", "Lower mid", "Upper mid", "High"))
head(df2$IncomeGroup)
## [1] High Low Lower mid Upper mid High High
## Levels: Low Lower mid Upper mid High
#6. Merge the 3 datasets attached into 1 dataframe : "dfCountryMaster"
# To read countries indicators csv file
df3 <- read.csv("Countries Indicators.csv")
# To merge Countries Population and Countries Region Mapping data frames
df_merge1 <- merge(x=dfCountries, y=df2, by = "Country.Code", all=TRUE)
head(df_merge1)
## Country.Code Country.Name Total.Population.2017 X X.1
## 1 ABW Aruba 105264 NA NA
## 2 AFG Afghanistan 35530081 NA NA
## 3 AGO Angola 29784193 NA NA
## 4 ALB Albania 2873457 NA NA
## 5 AND Andorra 76965 NA NA
## 6 ARE United Arab Emirates 9400145 NA NA
## Region IncomeGroup
## 1 Latin America & Caribbean High
## 2 South Asia Low
## 3 Sub-Saharan Africa Lower mid
## 4 Europe & Central Asia Upper mid
## 5 Europe & Central Asia High
## 6 Middle East & North Africa High
#To check the class of data frame
class(df_merge1)
## [1] "data.frame"
#To merge df_merge1 and Countries Indicators data frames
dfCountryMaster <- merge(x=df_merge1, y=df3, by="Country.Code", all=TRUE)
head(dfCountryMaster)
## Country.Code Country.Name Total.Population.2017 X.x X.1.x
## 1 ABW Aruba 105264 NA NA
## 2 AFG Afghanistan 35530081 NA NA
## 3 AGO Angola 29784193 NA NA
## 4 ALB Albania 2873457 NA NA
## 5 AND Andorra 76965 NA NA
## 6 ARB <NA> NA NA NA
## Region IncomeGroup GDP.per.capita.2017
## 1 Latin America & Caribbean High 25,655.10202
## 2 South Asia Low 550.0684588
## 3 Sub-Saharan Africa Lower mid 4,100.289786
## 4 Europe & Central Asia Upper mid 4,537.579056
## 5 Europe & Central Asia High 39,146.54884
## 6 <NA> <NA> 6,239.713933
## Under.5.Mortality.Rate.2017 X.y X.1.y
## 1 NA NA NA
## 2 67.9000 NA NA
## 3 81.1000 NA NA
## 4 8.8000 NA NA
## 5 3.3000 NA NA
## 6 36.6612 NA NA
colnames(dfCountryMaster)
## [1] "Country.Code" "Country.Name"
## [3] "Total.Population.2017" "X.x"
## [5] "X.1.x" "Region"
## [7] "IncomeGroup" "GDP.per.capita.2017"
## [9] "Under.5.Mortality.Rate.2017" "X.y"
## [11] "X.1.y"
#To check dimensions of 3 datasets
dim(dfCountries)
## [1] 219 5
dim(df2)
## [1] 216 3
dim(df3)
## [1] 241 5
# To check dimensions of merged dataset
dim(dfCountryMaster)
## [1] 260 11
# To check coloumns of merged dataset
names(dfCountryMaster)
## [1] "Country.Code" "Country.Name"
## [3] "Total.Population.2017" "X.x"
## [5] "X.1.x" "Region"
## [7] "IncomeGroup" "GDP.per.capita.2017"
## [9] "Under.5.Mortality.Rate.2017" "X.y"
## [11] "X.1.y"
#7. Summarize dfCountryMaster countries by region.
dfCountryMaster %>% group_by(Region) %>% summarise( num_countries = n())
## # A tibble: 8 × 2
## Region num_countries
## <chr> <int>
## 1 East Asia & Pacific 36
## 2 Europe & Central Asia 58
## 3 Latin America & Caribbean 42
## 4 Middle East & North Africa 21
## 5 North America 3
## 6 South Asia 8
## 7 Sub-Saharan Africa 48
## 8 <NA> 44
#8. Summarize dfCountryMaster countries by region and income group.
dfCountryMaster %>% group_by(Region, IncomeGroup) %>% summarise(num_countries = n())
## # A tibble: 25 × 3
## # Groups: Region [8]
## Region IncomeGroup num_countries
## <chr> <fct> <int>
## 1 East Asia & Pacific Low 1
## 2 East Asia & Pacific Lower mid 13
## 3 East Asia & Pacific Upper mid 9
## 4 East Asia & Pacific High 13
## 5 Europe & Central Asia Low 1
## 6 Europe & Central Asia Lower mid 6
## 7 Europe & Central Asia Upper mid 14
## 8 Europe & Central Asia High 37
## 9 Latin America & Caribbean Low 1
## 10 Latin America & Caribbean Lower mid 4
## # ℹ 15 more rows
#9. Summarize dfCountryMaster countries by region. Result to have the following columns in it.
#a. Number of countries.
#b. Total population in millions.
#c. Average of GDP per capital
#d. Countries with low income.
#e. Median GDP per capital
#f. minimum and maximum mortality rate under 5.
Summary_countries_by_region <- dfCountryMaster %>% group_by(Region) %>%
summarise( num_countries = n(),
total_population_million = sum(Total.Population.2017, na.rm = TRUE)/1e6,
avg_GDP_per_capital = mean(as.numeric(GDP.per.capita.2017), na.rm = TRUE),
low_income_countries = sum(IncomeGroup == "Low income", na.rm = TRUE),
median_gdp_per_capita = median(as.numeric(GDP.per.capita.2017), na.rm = TRUE),
min_mortality_under_5 = min(Under.5.Mortality.Rate.2017, na.rm = TRUE),
max_mortality_under_5 = max(Under.5.Mortality.Rate.2017, na.rm = TRUE)
)
#10. Write the above result in csv.
write.csv(Summary_countries_by_region, file = "Country Summary by Region.csv", row.names = FALSE)
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.